In [1]:
import graphlab as gl
from nltk.stem import PorterStemmer, SnowballStemmer
In [2]:
train = gl.SFrame.read_csv("../data/train.csv")
In [3]:
test = gl.SFrame.read_csv("../data/test.csv")
In [4]:
desc = gl.SFrame.read_csv("../data/product_descriptions.csv")
In [5]:
# merge train with description
train = train.join(desc, on = 'product_uid', how = 'left')
In [6]:
# merge test with description
test = test.join(desc, on = 'product_uid', how = 'left')
Let's examine three different queries and their products:
In [7]:
first_doc = train[0]
first_doc
Out[7]:
The search term 'angle bracket' does not appear verbatim in the description: after stemming, 'angle' would match, but 'bracket' still would not.
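A quick check of that claim with NLTK's Porter stemmer (a sketch; the actual description text comes from the dataset):
In [ ]:
stemmer_check = PorterStemmer()
# 'angle' and 'angles' collapse to the same stem, so they match after stemming...
print stemmer_check.stem('angle'), stemmer_check.stem('angles')      # angl angl
# ...but 'bracket' matches only if some bracket-derived word is present
print stemmer_check.stem('bracket'), stemmer_check.stem('brackets')  # bracket bracket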
In [8]:
middle_doc = train[37033]
middle_doc
Out[8]:
Of the search term, only 'wood' is present in the description.
In [9]:
last_doc = train[-1]
last_doc
Out[9]:
From the search term, only 'sheer' and 'courtain' (as spelled in the query) are present, and nothing else.
In [10]:
train['search_term_word_count'] = gl.text_analytics.count_words(train['search_term'])
ranked3doc = train[train['relevance'] == 3]
print ranked3doc.head()
len(ranked3doc)
Out[10]:
In [11]:
words_search = gl.text_analytics.tokenize(ranked3doc['search_term'], to_lower = True)
words_description = gl.text_analytics.tokenize(ranked3doc['product_description'], to_lower = True)
words_title = gl.text_analytics.tokenize(ranked3doc['product_title'], to_lower = True)
wordsdiff_desc = []
wordsdiff_title = []
puid = []
search_term = []
ws_count = []
ws_count_used_desc = []
ws_count_used_title = []
for item in xrange(len(ranked3doc)):
    ws = words_search[item]
    pd = words_description[item]
    pt = words_title[item]
    # search-term words missing from the description / title; a set difference
    # is always a set (possibly empty), so no None check is needed
    diff = set(ws) - set(pd)
    wordsdiff_desc.append(diff)
    diff2 = set(ws) - set(pt)
    wordsdiff_title.append(diff2)
    puid.append(ranked3doc[item]['product_uid'])
    search_term.append(ranked3doc[item]['search_term'])
    ws_count.append(len(ws))
    ws_count_used_desc.append(len(ws) - len(diff))
    ws_count_used_title.append(len(ws) - len(diff2))
differences = gl.SFrame({"puid" : puid,
                         "search term": search_term,
                         "diff desc" : wordsdiff_desc,
                         "diff title" : wordsdiff_title,
                         "ws count" : ws_count,
                         "ws count used desc" : ws_count_used_desc,
                         "ws count used title" : ws_count_used_title})
In [12]:
differences.sort(['ws count used desc', 'ws count used title'])
Out[12]:
In [13]:
print "No terms used in description : " + str(len(differences[differences['ws count used desc'] == 0]))
print "No terms used in title : " + str(len(differences[differences['ws count used title'] == 0]))
print "No terms used in description and title : " + str(len(differences[(differences['ws count used desc'] == 0) &
(differences['ws count used title'] == 0)]))
In [14]:
import matplotlib.pyplot as plt
%matplotlib inline
In [15]:
#stemmer = SnowballStemmer("english")
stemmer = PorterStemmer()
def stem(text):
    # stem each whitespace-separated word and rejoin; errors='replace' guards
    # against non-UTF-8 bytes in the raw CSV
    singles = [stemmer.stem(word) for word in unicode(text, errors='replace').split()]
    return ' '.join(singles)
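A quick look at what the stemmer produces on a sample phrase:
In [ ]:
print stem('angle brackets')   # -> 'angl bracket'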
In [16]:
print "Starting stemming train search term..."
stemmed = train['search_term'].apply(stem)
train['stem_search_term'] = stemmed
print "Starting stemming train product description..."
stemmed = train['product_description'].apply(stem)
train['stem_product_description'] = stemmed
print "Starting stemming train product title..."
stemmed = train['product_title'].apply(stem)
train['stem_product_title'] = stemmed
print "Starting stemming test search term..."
stemmed = test['search_term'].apply(stem)
test['stem_search_term'] = stemmed
print "Starting stemming test product description..."
stemmed = test['product_description'].apply(stem)
test['stem_product_description'] = stemmed
print "Starting stemming test product title..."
stemmed = test['product_title'].apply(stem)
test['stem_product_title'] = stemmed
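The six nearly identical blocks above could equally be written as a loop (same behavior, minus the progress prints):
In [ ]:
for frame in (train, test):
    for col in ('search_term', 'product_description', 'product_title'):
        frame['stem_' + col] = frame[col].apply(stem)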
In [26]:
train['stem_search_term_split'] = train['stem_search_term'].apply(lambda x: x.split())
train['stem_product_title_split'] = train['stem_product_title'].apply(lambda x: x.split())
In [33]:
# Note: bm25 expects a single query (a list/set/SArray of query words), so passing
# the whole search-term column scores every title against one big pooled query.
train_bm25_title = gl.text_analytics.bm25(train['stem_product_title_split'], train['stem_search_term'])
In [35]:
train_bm25_title
Out[35]:
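For reference, gl.text_analytics.bm25 implements the Okapi BM25 score (defaults k1 = 1.5, b = 0.75). In its usual form, for a query $Q = \{q_1, \ldots, q_n\}$ and a document $d$,
$$\mathrm{BM25}(d, Q) = \sum_{i=1}^{n} \mathrm{IDF}(q_i)\,\frac{f(q_i, d)\,(k_1 + 1)}{f(q_i, d) + k_1\left(1 - b + b\,\frac{|d|}{\mathrm{avgdl}}\right)}$$
where $f(q_i, d)$ is the count of $q_i$ in $d$, $|d|$ is the length of $d$, and $\mathrm{avgdl}$ is the average document length over the corpus.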
In [19]:
# TF-IDF vectors for the stemmed search terms (mirrors the test-side cell below;
# the 'search_tfidf' column is needed for the distance features), then for the
# stemmed descriptions.
train['search_term_word_count'] = gl.text_analytics.count_words(train['stem_search_term'])
train_search_tfidf = gl.text_analytics.tf_idf(train['search_term_word_count'])
train['search_tfidf'] = train_search_tfidf
train['product_desc_word_count'] = gl.text_analytics.count_words(train['stem_product_description'])
train_desc_tfidf = gl.text_analytics.tf_idf(train['product_desc_word_count'])
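For reference, gl.text_analytics.tf_idf uses the standard scheme (per its documentation): for word $w$ in document $d$, with $N$ documents in total,
$$\mathrm{tfidf}(w, d) = f(w, d)\,\log\!\left(\frac{N}{1 + |\{d' : w \in d'\}|}\right)$$
so words common to many documents contribute little, while rare words repeated within a document dominate its vector.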
In [20]:
train['desc_tfidf'] = train_desc_tfidf
In [21]:
train['product_title_word_count'] = gl.text_analytics.count_words(train['stem_product_title'])
train_title_tfidf = gl.text_analytics.tf_idf(train['product_title_word_count'])
train['title_tfidf'] = train_title_tfidf
In [22]:
train['distance_desc'] = train.apply(lambda x: gl.distances.cosine(x['search_tfidf'],x['desc_tfidf']))
#train['distance_desc_sqrt'] = train['distance_desc'] ** 2
train['distance_title'] = train.apply(lambda x: gl.distances.cosine(x['search_tfidf'],x['title_tfidf']))
#train['distance_title_sqrt'] = train['distance_title'] ** 3
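gl.distances.cosine on two sparse dict vectors returns the cosine distance 1 - cos(u, v); a small pure-Python check of that convention on made-up vectors:
In [ ]:
import math
u = {'wood': 1.2, 'fence': 0.8}   # hypothetical tf-idf vectors
v = {'wood': 0.9, 'post': 1.1}
dot = sum(u[k] * v[k] for k in u if k in v)
norm = lambda w: math.sqrt(sum(x * x for x in w.values()))
print 1 - dot / (norm(u) * norm(v))   # matches gl.distances.cosine(u, v)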
In [36]:
model1 = gl.random_forest_regression.create(train, target = 'relevance',
                                            features = ['distance_desc', 'distance_title'],
                                            num_trees = 500,
                                            validation_set = None)
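With validation_set = None there is no held-out split, so an in-sample check is the only score available here; evaluate() on GraphLab regression models reports rmse and max_error (optimistic, since it is computed on the training data):
In [ ]:
print model1.evaluate(train)   # in-sample {'max_error': ..., 'rmse': ...}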
In [37]:
test['search_term_word_count'] = gl.text_analytics.count_words(test['stem_search_term'])
test_search_tfidf = gl.text_analytics.tf_idf(test['search_term_word_count'])
test['search_tfidf'] = test_search_tfidf
test['product_desc_word_count'] = gl.text_analytics.count_words(test['stem_product_description'])
test_desc_tfidf = gl.text_analytics.tf_idf(test['product_desc_word_count'])
test['desc_tfidf'] = test_desc_tfidf
test['product_title_word_count'] = gl.text_analytics.count_words(test['stem_product_title'])
test_title_tfidf = gl.text_analytics.tf_idf(test['product_title_word_count'])
test['title_tfidf'] = test_title_tfidf
test['distance_desc'] = test.apply(lambda x: gl.distances.cosine(x['search_tfidf'],x['desc_tfidf']))
#test['distance_desc_sqrt'] = test['distance_desc'] ** 2
test['distance_title'] = test.apply(lambda x: gl.distances.cosine(x['search_tfidf'],x['title_tfidf']))
#test['distance_title_sqrt'] = test['distance_title'] ** 3
In [38]:
# Left unexecuted: the Kaggle test set ships without a 'relevance' column,
# so there is nothing to compute an RSS against.
'''
predictions_test = model1.predict(test)
test_errors = predictions_test - test['relevance']
RSS_test = sum(test_errors * test_errors)
print RSS_test
'''
Out[38]:
In [39]:
predictions_test = model1.predict(test)
predictions_test
Out[39]:
In [40]:
#result = model1.evaluate(test)
#result
In [41]:
# Build the submission: the test ids plus the predicted relevance scores
# (the auto-generated column names X1/X2 are renamed below).
submission = gl.SFrame(test['id'])
In [42]:
submission.add_column(predictions_test)
submission.rename({'X1': 'id', 'X2':'relevance'})
Out[42]:
In [43]:
# Clip predictions to the valid label range [1.0, 3.0]
submission['relevance'] = submission.apply(lambda x: 3.0 if x['relevance'] > 3.0 else x['relevance'])
submission['relevance'] = submission.apply(lambda x: 1.0 if x['relevance'] < 1.0 else x['relevance'])
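The two clips above can equally be done in one pass over the column:
In [ ]:
submission['relevance'] = submission['relevance'].apply(lambda r: min(max(r, 1.0), 3.0))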
In [44]:
submission['relevance'] = submission.apply(lambda x: str(x['relevance']))
In [45]:
# quote_level = 3 corresponds to csv.QUOTE_NONE
submission.export_csv('../data/submission2.csv', quote_level = 3)
In [ ]:
#gl.canvas.set_target('ipynb')